# Computations
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
# Visualisation libraries
## Text
from colorama import Fore, Back, Style
from IPython.display import Image, display, Markdown, Latex, clear_output
## plotly
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
import plotly.express as px
## seaborn
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("paper", rc={"font.size":12,"axes.titlesize":14,"axes.labelsize":12})
## matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse, Polygon
import matplotlib.gridspec as gridspec
import matplotlib.colors
from pylab import rcParams
plt.style.use('seaborn-whitegrid')
import matplotlib as mpl
mpl.rcParams['figure.figsize'] = (17, 6)
mpl.rcParams['axes.labelsize'] = 14
mpl.rcParams['xtick.labelsize'] = 12
mpl.rcParams['ytick.labelsize'] = 12
mpl.rcParams['text.color'] = 'k'
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
In this article, we demonstrate calculting RFM), which is a method used for analyzing customer value, for Online Retail Dataset from UC Irvine Machine Learning Repository.
This is a transnational data set that contains all the transactions occurring between 01/12/2010 and 09/12/2011 for a UK-based and registered non-store online retail. The company mainly sells unique all-occasion gifts. Many customers of the company are wholesalers.
Data = pd.read_excel('Data/Online Retail.xlsx')
display(display(Data.head(10)))
def Data_Plot(Inp):
data_info = Inp.dtypes.astype(str).to_frame(name='Data Type')
Temp = Inp.isnull().sum().to_frame(name = 'Number of NaN Values')
data_info = data_info.join(Temp, how='outer')
data_info ['Size'] = Inp.shape[0]
data_info['Percentage'] = 100 - np.round(100*(data_info['Number of NaN Values']/Inp.shape[0]),2)
data_info.index.name = 'Features'
data_info = data_info.reset_index(drop = False)
#
fig = px.bar(data_info, x= 'Features', y= 'Percentage', color = 'Data Type', text = 'Data Type',
color_discrete_sequence = ['PaleGreen', 'LightCyan', 'PeachPuff', 'Pink', 'Plum'],
hover_data = data_info.columns)
fig.update_layout(plot_bgcolor= 'white', legend=dict(x=1.01, y=.5, traceorder="normal",
bordercolor="DarkGray", borderwidth=1), width = 700)
fig.update_traces(texttemplate= 6*' ' + '%{label}', textposition='inside')
fig.update_traces(marker_line_color= 'Black', marker_line_width=1., opacity=1)
fig.show()
Data_Plot(Data)
| InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | |
|---|---|---|---|---|---|---|---|---|
| 0 | 536365 | 85123A | WHITE HANGING HEART T-LIGHT HOLDER | 6 | 2010-12-01 08:26:00 | 2.55 | 17850.0 | United Kingdom |
| 1 | 536365 | 71053 | WHITE METAL LANTERN | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom |
| 2 | 536365 | 84406B | CREAM CUPID HEARTS COAT HANGER | 8 | 2010-12-01 08:26:00 | 2.75 | 17850.0 | United Kingdom |
| 3 | 536365 | 84029G | KNITTED UNION FLAG HOT WATER BOTTLE | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom |
| 4 | 536365 | 84029E | RED WOOLLY HOTTIE WHITE HEART. | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom |
| 5 | 536365 | 22752 | SET 7 BABUSHKA NESTING BOXES | 2 | 2010-12-01 08:26:00 | 7.65 | 17850.0 | United Kingdom |
| 6 | 536365 | 21730 | GLASS STAR FROSTED T-LIGHT HOLDER | 6 | 2010-12-01 08:26:00 | 4.25 | 17850.0 | United Kingdom |
| 7 | 536366 | 22633 | HAND WARMER UNION JACK | 6 | 2010-12-01 08:28:00 | 1.85 | 17850.0 | United Kingdom |
| 8 | 536366 | 22632 | HAND WARMER RED POLKA DOT | 6 | 2010-12-01 08:28:00 | 1.85 | 17850.0 | United Kingdom |
| 9 | 536367 | 84879 | ASSORTED COLOUR BIRD ORNAMENT | 32 | 2010-12-01 08:34:00 | 1.69 | 13047.0 | United Kingdom |
None
Droping NaN Values
Data.dropna(inplace = True)
Removing duplicated entries:
print(Fore.RED + 'Number of Duplicated Entries:' + Style.RESET_ALL, ' = %i' % Data.duplicated().sum())
Data.drop_duplicates(inplace = True)
Number of Duplicated Entries: = 5225
print(Back.CYAN +'Data size:' + Style.RESET_ALL, '%i rows and %i columns' % Data.shape)
print(Back.MAGENTA +'Timeline:' + Style.RESET_ALL, 'From %s till %s' % (Data['InvoiceDate'].min(), Data['InvoiceDate'].max()))
Data size: 401604 rows and 8 columns Timeline: From 2010-12-01 08:26:00 till 2011-12-09 12:50:00
Next, we create some new features.
| Feature | Description |
|---|---|
| Total_Spending | Total amount of spending |
| Recency | Days since the last purchase |
| Frequency | Transactions numbers over a defined period |
| Monetary | Total spending over a defined period |
| RFM Group | Assigning a customer to an RFM group based on their recency, frequency, monetary |
Customers here are divided into several groups based on their recency, frequency, monetary.
# Total Spending
Data['Total_Spending'] = Data['Quantity'] * Data['UnitPrice']
# Recency
LastDay = Data['InvoiceDate'].max() + timedelta(days=1)
Temp0 = pd.DataFrame(Data.groupby(['CustomerID'])['InvoiceDate'].max())
Temp0.columns = ['Recency']
Temp0 = Temp0.reset_index(drop = False)
Temp0.iloc[:,1] = Temp0.iloc[:,1].apply(lambda x: (Data['InvoiceDate'].max() + timedelta(days=1) - x).days)
# Frequency
Temp1 = pd.DataFrame(Data.groupby(['CustomerID'])['InvoiceDate'].count())
Temp1.columns = ['Frequency']
Temp1 = Temp1.reset_index(drop = False)
# Monetary
Temp2 = pd.DataFrame(Data.groupby(['CustomerID'])['Total_Spending'].sum())
Temp2.columns = ['Monetary']
Temp2 = Temp2.reset_index(drop = False)
# Merging
df = Temp0.merge(Temp1, left_on='CustomerID', right_on='CustomerID')
df = df.merge(Temp2, left_on='CustomerID', right_on='CustomerID')
del Temp0, Temp1, Temp2
# RFM Groups
df['R'] = pd.qcut(df['Recency'], q=4, labels=np.arange(4, 0, -1))
df['F'] = pd.qcut(df['Frequency'], q=4, labels=np.arange(1, 5, 1))
df['M'] = pd.qcut(df['Monetary'], q=4, labels=np.arange(1, 5, 1))
def RFM_df(x):
return 'R' + str(x['R']) + 'F' + str(x['F']) + 'M' + str(x['M'])
df['RFM_Group'] = df.apply(RFM_df, axis=1)
display(df.head().style.hide_index())
| CustomerID | Recency | Frequency | Monetary | R | F | M | RFM_Group |
|---|---|---|---|---|---|---|---|
| 12346.000000 | 326 | 2 | 0.000000 | 1 | 1 | 1 | R1.0F1.0M1.0 |
| 12347.000000 | 2 | 182 | 4310.000000 | 4 | 4 | 4 | R4.0F4.0M4.0 |
| 12348.000000 | 75 | 31 | 1797.240000 | 2 | 2 | 4 | R2.0F2.0M4.0 |
| 12349.000000 | 19 | 73 | 1757.550000 | 3 | 3 | 4 | R3.0F3.0M4.0 |
| 12350.000000 | 310 | 17 | 334.400000 | 1 | 1 | 2 | R1.0F1.0M2.0 |
A representation of the distribution of data can be found using pandas hist function:
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(14, 6), sharex = False)
_ = df.hist(ax = ax[0], bins=20, column= 'Recency', grid=True, color='HotPink', edgecolor='Indigo', hatch = '//')
_ = ax[0].set_xlim([0,4e2])
_ = ax[0].set_ylim([0,14e2])
_ = df.hist(ax = ax[1], bins=100, column= 'Frequency', grid=True, color='LimeGreen', edgecolor='DarkGreen', hatch = '//')
_ = ax[1].set_xlim([0,1e3])
_ = ax[1].set_ylim([0,4e3])
_ = df.hist(ax = ax[2], bins=100, column= 'Monetary', grid=True, color='LightBlue', edgecolor='Navy', hatch = '//')
_ = ax[2].set_xlim([0,2e4])
_ = ax[2].set_ylim([0,4e3])
Group = df.groupby('RFM_Group')[['R','F','M']].count().reset_index(drop = False)
Group = pd.melt(Group, id_vars=['RFM_Group'], value_vars=['R','F','M'], var_name='RFM', value_name='Count')
fig, ax = plt.subplots(1, 1, figsize=(16, 22), sharex=True)
_ = sns.barplot(y="RFM_Group", x="Count", hue="RFM", data=Group, ax = ax,
palette = sns.color_palette(["#9b59b6","#e74c3c", "#2ecc71"]), edgecolor='k')
_ = ax.set_xlim([0,500])
_ = ax.set_xlabel('Count')
_ = ax.set_ylabel('RFM Group')
_ = ax.legend(scatterpoints=1, frameon=False, labelspacing=1, title='')
df['RFM_Score'] = df[['R','F','M']].sum(axis=1).astype(int)
df['RFM_Level'] = df['RFM_Score'].apply(lambda x: x - df['RFM_Score'].min() + 1)
display(df.head().style.hide_index())
| CustomerID | Recency | Frequency | Monetary | R | F | M | RFM_Group | RFM_Score | RFM_Level |
|---|---|---|---|---|---|---|---|---|---|
| 12346.000000 | 326 | 2 | 0.000000 | 1 | 1 | 1 | R1.0F1.0M1.0 | 3 | 1 |
| 12347.000000 | 2 | 182 | 4310.000000 | 4 | 4 | 4 | R4.0F4.0M4.0 | 12 | 10 |
| 12348.000000 | 75 | 31 | 1797.240000 | 2 | 2 | 4 | R2.0F2.0M4.0 | 8 | 6 |
| 12349.000000 | 19 | 73 | 1757.550000 | 3 | 3 | 4 | R3.0F3.0M4.0 | 10 | 8 |
| 12350.000000 | 310 | 17 | 334.400000 | 1 | 1 | 2 | R1.0F1.0M2.0 | 4 | 2 |
Group = pd.DataFrame(df.groupby('RFM_Level')['CustomerID'].count())
Group.columns = ['Count']
Group = Group.reset_index(drop = False)
Group = Group.sort_values(by=['RFM_Level'])
Group1 = df.groupby(['RFM_Level'])[['Recency', 'Frequency', 'Monetary']].mean().round(2).reset_index()
Group = Group.merge(Group1, on='RFM_Level')
display(Group)
del Group1
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(14, 6))
_ = sns.barplot(x="RFM_Level", y="Count", palette="Purples_d", edgecolor = 'White', hatch = '//', data = Group, ax = ax[0])
_ = sns.barplot(x="RFM_Level", y="Count", facecolor = 'None', edgecolor = 'Black', data = Group, ax = ax[0])
_ = ax[0].set_ylim([0,600])
_ = ax[0].set_xlabel('RFM Level')
_ = Group.plot.pie(y='Count', startangle=90, legend=False,label='',ax=ax[1], autopct='%1.1f%%',
colors = sns.color_palette("muted", len(Group)), fontsize=12)
_ = ax[1].legend(bbox_to_anchor=(1, .8), fontsize=11, title='RFM Levels')
| RFM_Level | Count | Recency | Frequency | Monetary | |
|---|---|---|---|---|---|
| 0 | 1 | 393 | 264.75 | 7.85 | 109.25 |
| 1 | 2 | 390 | 174.88 | 13.79 | 226.57 |
| 2 | 3 | 515 | 152.50 | 20.92 | 342.69 |
| 3 | 4 | 469 | 94.59 | 28.11 | 490.78 |
| 4 | 5 | 439 | 79.49 | 39.22 | 722.68 |
| 5 | 6 | 467 | 62.76 | 56.08 | 965.48 |
| 6 | 7 | 413 | 44.37 | 77.38 | 1341.58 |
| 7 | 8 | 442 | 31.52 | 112.60 | 1819.21 |
| 8 | 9 | 374 | 20.47 | 190.42 | 3892.08 |
| 9 | 10 | 470 | 6.71 | 367.72 | 8848.13 |